library(pacman)
pacman::p_load(edgeR, RColorBrewer, gplots, ggplot2, reshape2, DT, cowplot,
               limma, DESeq, DESeq2, data.table, e1071, ComplexHeatmap)

Source functions

source("https://raw.githubusercontent.com/CJinnny/RNA-seq-tutorial/master/RNA_seq_tutorial_functions.R")

Data merging

This data came out of Kallisto software, gene counts are the sum of trancript pseudocounts. These counts are not in round numbers, so we need to round them to integers first.

Here I created my own transcriptome assembly using StringTie https://ccb.jhu.edu/software/stringtie/, hense the “MSTRG.” gene_id.

The reason I did that was because I not only want to study genes found in the reference genome, but also characterize new transcripts.

In order to know gene functions, we can convert MSTRG id into maize reference genome id (v4), which is already well characterized.

require(data.table)
data <- fread("https://raw.githubusercontent.com/CJinnny/RNA-seq-tutorial/master/genes.txt", sep = "\t")
data[,6:15] = round(data[,6:15])

id_conv <- fread("https://raw.githubusercontent.com/CJinnny/RNA-seq-tutorial/master/MSTRG_to_Zm_id.txt", header = FALSE, sep = "\t")
functions <- fread("https://raw.githubusercontent.com/CJinnny/RNA-seq-tutorial/master/B73v4_gene_function.txt", header = FALSE, sep = "\t")
functions <- subset(functions, select = c("V1", "V2"))
colnames(id_conv) <- c("MSTRG_gene_id", "v4_gene_id")
colnames(functions) <- c("v4_gene_id", "gene_function")
annotations = merge(id_conv, functions, by = "v4_gene_id", all.x = TRUE)
data <- merge(data, annotations, by = "MSTRG_gene_id", all.x = TRUE)
data <- aggregate(data, 
                  by = list(data$MSTRG_gene_id),
                  FUN = aggregate_func
                  )
data <- na.omit(data, cols="v4_gene_id")
data <- data.frame(data, row.names = 1)
setnames(data, old = c("UU1", "UU2", "UU3", "UU4", "UU5", "WW1", "WW2", "WW3", "WW4", "WW5"), 
         new = c("U1", "U2","U3","U4","U5", "W1", "W2","W3","W4","W5"))
head(data)
##             MSTRG_gene_id chr_name    start      end strand  U1  U2  U3
## MSTRG.1000     MSTRG.1000     chr1 25047512 25050032      - 195 333 154
## MSTRG.10000   MSTRG.10000    chr10 32649346 32649418      +   0   0   0
## MSTRG.10002   MSTRG.10002    chr10 32585811 32588982      - 154 244 273
## MSTRG.10005   MSTRG.10005    chr10 32815479 32817141      +   0   0   0
## MSTRG.10007   MSTRG.10007    chr10 32529623 32567490      - 419 381 397
## MSTRG.1001     MSTRG.1001     chr1 25152267 25155144      + 139 152 150
##              U4  U5  W1  W2  W3  W4  W5     v4_gene_id
## MSTRG.1000  166 105 117  52 126 123  81 Zm00001d028167
## MSTRG.10000   0   0   0   0   0   0   0 Zm00001d023970
## MSTRG.10002 106 242 226 212 274 218 212 Zm00001d023969
## MSTRG.10005   0   0   0   0   0   1   0 Zm00001d023973
## MSTRG.10007 223 395 423 345 406 435 375 Zm00001d023968
## MSTRG.1001   81 190 167  92 110 182 148 Zm00001d028171
##                                                            gene_function
## MSTRG.1000           Cytochrome b561 and DOMON domain-containing protein
## MSTRG.10000                                                           NA
## MSTRG.10002                         A/G-specific adenine DNA glycosylase
## MSTRG.10005                     Cysteine proteinases superfamily protein
## MSTRG.10007                                    Glutathione S-transferase
## MSTRG.1001  Calcium-dependent lipid-binding (CaLB domain) family protein
counts <- data[,6:15]
require(e1071)
data_diagnosis(counts)
## skewness is:
##  40.72006 45.82615 23.87964 39.31152 25.54488 46.11351 37.21044 42.03261 29.757 61.20547 
## kurtosis is:
##  2725.111 3708.717 1079.134 2462.335 1098.406 3271.049 2443.01 2906.647 1718.613 6269.872

Data transformation

Data is skewed: most values are around 0, but few outliers have high values.

We need to do data transformations for PCA plot, otherwise outliers will have a great impact on the clustering.

log transformation: logcounts = log2(counts + 1)

DESeq2::rlog(): “regularized log” transformation. For more information see https://rdrr.io/bioc/DESeq2/man/rlog.html

edgeR::cpm(): “counts per million” transformation. For more information see https://rdrr.io/bioc/edgeR/man/cpm.html

DESeq2:varianceStabilizingTransformation(): “variance stabilizing transformation”. For more information see https://rdrr.io/bioc/DESeq2/man/varianceStabilizingTransformation.html

require(DESeq2)
require(edgeR)
logcounts = log2(counts + 1)
rlogcounts = rlog(as.matrix(counts))
rownames(rlogcounts) = rownames(logcounts)
cpmcounts = cpm(as.matrix(counts), prior.count = 2, log = TRUE)
vstcounts = varianceStabilizingTransformation(as.matrix(counts))
data_diagnosis(logcounts)
## skewness is:
##  -0.06744175 -0.06788012 -0.07223585 0.02643566 -0.06926398 -0.06282771 -0.002090134 -0.03638436 -0.05022853 -0.02295028 
## kurtosis is:
##  -1.45588 -1.470203 -1.478033 -1.442948 -1.466676 -1.457927 -1.481011 -1.476796 -1.485275 -1.481922

data_diagnosis(rlogcounts)
## skewness is:
##  -0.107869 -0.1180331 -0.1177119 -0.1130238 -0.1137576 -0.1143162 -0.1122595 -0.1080767 -0.1176406 -0.1094335 
## kurtosis is:
##  -1.406649 -1.417596 -1.420769 -1.410637 -1.411501 -1.410185 -1.415751 -1.410148 -1.42091 -1.415035

data_diagnosis(cpmcounts)
## skewness is:
##  0.07228723 0.03930489 0.03457964 0.05780575 0.0485479 0.04905025 0.05757718 0.07313324 0.03478149 0.06931923 
## kurtosis is:
##  -1.418314 -1.44688 -1.460882 -1.429948 -1.437775 -1.430747 -1.461798 -1.442083 -1.467098 -1.453814

data_diagnosis(vstcounts)
## skewness is:
##  0.691676 0.6513708 0.6538715 0.6695807 0.6746568 0.6729289 0.6653166 0.6836064 0.6459285 0.6685212 
## kurtosis is:
##  -0.3981678 -0.5048652 -0.5732651 -0.4227583 -0.4496328 -0.4397634 -0.5113099 -0.4320609 -0.5695674 -0.5032185

Principle Component Analysis (PCA)

require(graphics)
require(RColorBrewer)
par(mfrow=c(2,3), mar=c(5.1, 4.6, 4.1, 1.6))
draw_PCA(counts, title = "PCA on raw data")
draw_PCA(logcounts, title = "PCA on log transformed data")
draw_PCA(rlogcounts, title = "PCA on rlog transformed data")
draw_PCA(cpmcounts, title = "PCA on cpm transformed data")
draw_PCA(vstcounts, title = "PCA on vst transformed data")

Draw multidimensional scaling (MDS)

par(mfrow=c(2,3))
plotMDS(counts, col = c(rep("red", 5), rep("blue", 5)), cex = 1.5)
title("MDS plot on raw data")
plotMDS(logcounts, col = c(rep("red", 5), rep("blue", 5)), cex = 1.5)
title("MDS plot on log transformed data")
plotMDS(rlogcounts, col = c(rep("red", 5), rep("blue", 5)), cex = 1.5)
title("MDS plot on rlog transformed data")
plotMDS(cpmcounts, col = c(rep("red", 5), rep("blue", 5)), cex = 1.5)
title("MDS plot on cpm transformed data")
plotMDS(vstcounts, col = c(rep("red", 5), rep("blue", 5)), cex = 1.5)
title("MDS plot on vst transformed data")

Draw correlation heatmap

require(gplots)
draw_corr_heatmap(as.matrix(counts), show_cellnote = TRUE, title = "clustering sample-to-sample\n distance on raw data")

draw_corr_heatmap(as.matrix(logcounts), show_cellnote = TRUE, title = "clustering sample-to-sample\n distance on log transformed data")

draw_corr_heatmap(rlogcounts, show_cellnote = TRUE, title = "clustering sample-to-sample\n distance on rlog transformed data")

draw_corr_heatmap(cpmcounts, show_cellnote = TRUE, title = "clustering sample-to-sample\n distance on cpm transformed data")

draw_corr_heatmap(vstcounts, show_cellnote = TRUE, title = "clustering sample-to-sample\n distance on vst transformed data")

Set conditions for DE analysis

conditions = factor(c(rep("Ufo",5), rep("Wt",5)))

Filter low-count reads

keep <- rowSums(cpm(counts)>1) >= 5
table(keep)
## keep
## FALSE  TRUE 
## 17671 22409
keep_true = data.frame(keep[which(keep == TRUE)])
filter = subset(counts, rownames(counts) %in% rownames(keep_true))

DESeq

require(DESeq)
cds = newCountDataSet(filter, conditions)
cds = estimateSizeFactors(cds)
cds = estimateDispersions(cds)
DESeq_res = nbinomTest(cds, "Wt", "Ufo")
rownames(DESeq_res) = DESeq_res$id
DESeq_DE = subset(DESeq_res, (log2FoldChange < -1 & padj < 0.05) | (log2FoldChange > 1 & padj < 0.05) )
DESeq_nc = counts(cds, normalized = TRUE)
DESeq_nc = data.frame("id"=rownames(DESeq_nc), DESeq_nc)
DESeq_nc_DE = subset(DESeq_nc, id %in% DESeq_DE$id)

DESeq2

require(DESeq2)
colData = data.frame(samples=colnames(filter), conditions=conditions)
dds = DESeqDataSetFromMatrix(countData = filter, colData = colData, design = ~conditions)
## converting counts to integer mode
dds = DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
DESeq2_res = results(dds)
DESeq2_DE = subset(data.frame(DESeq2_res), (log2FoldChange < -1 & padj < 0.05) | (log2FoldChange > 1 & padj < 0.05) )
DESeq2_DE = data.frame("id"=rownames(DESeq2_DE), DESeq2_DE)
DESeq2_nc = counts(dds, normalized = TRUE)
DESeq2_nc = data.frame("id" = rownames(DESeq2_nc), DESeq2_nc)
DESeq2_nc_DE = subset(DESeq2_nc, id %in% DESeq2_DE$id)

edgeR

require(edgeR)
group = as.vector(conditions)
dge = DGEList(counts = filter, group = group)
dge = calcNormFactors(dge)
dis = estimateCommonDisp(dge)
tag = estimateTagwiseDisp(dis)
etx = exactTest(tag)
edgeR_res = etx$table
edgeR_res$FDR = p.adjust(edgeR_res$PValue, method = "BH")
edgeR_DE = subset(edgeR_res, (logFC < -1 & FDR < 0.05) | (logFC > 1 & FDR < 0.05) )
edgeR_DE = data.frame("id" = rownames(edgeR_DE), edgeR_DE)
edgeR_nc = tag$pseudo.counts
edgeR_nc = data.frame("id"=rownames(edgeR_nc), edgeR_nc)
edgeR_nc_DE = subset(edgeR_nc, id %in% edgeR_DE$id)

limma

require(limma)
design = model.matrix(~conditions)
voom = voom(filter, design, normalize="quantile")
fit = lmFit(voom, design)
fit = eBayes(fit)
limma_res = topTable(fit, coef = NULL, n=Inf)
## Removing intercept from test coefficients
limma_DE = subset(limma_res, (logFC < -1 & adj.P.Val < 0.05) | (logFC > 1 & adj.P.Val < 0.05) )
limma_DE = data.frame("id"=rownames(limma_DE), limma_DE)
limma_nc = 2**voom$E
limma_nc = data.frame("id"=rownames(limma_nc), limma_nc)
limma_nc_DE = subset(limma_nc, id %in% limma_DE$id)

Compare DESeq, DESeq2, edgeR and limma DEG results

dflist <- list(DESeq=DESeq_DE, DESeq2=DESeq2_DE, edgeR=edgeR_DE, limma=limma_DE)
Compare <- join_id(dflist)
## Loading required package: plyr
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:matrixStats':
## 
##     count
## The following object is masked from 'package:IRanges':
## 
##     desc
## The following object is masked from 'package:S4Vectors':
## 
##     rename
Compare$message
## [1] "There are 315 genes DE in all 4 methods 86 genes in 3, 186 genes in 2, 81 genes in 1."

displaying table using DT library

library(DT)
summary <- merge(Compare$merged_table, annotations, by.x = "id", by.y = "MSTRG_gene_id", all.x = TRUE)
datatable(summary)

Draw Venndiagram

require(VennDiagram)
## Loading required package: VennDiagram
## Loading required package: futile.logger
draw_venndiagram(dflist, Compare$merged_table)

Draw heatmap for DEGs in separate plots

require(ComplexHeatmap)
Ht1 = DE_heatmap(DESeq_nc_DE, title = "DESeq", km = 2)
Ht2 = DE_heatmap(DESeq2_nc_DE, title = "DESeq2", km = 2)
Ht3 = DE_heatmap(edgeR_nc_DE, title = "edgeR", km = 2)
Ht4 = DE_heatmap(limma_nc_DE, title = "limma", km = 2)
Ht1

Ht2

Ht3

Ht4

Draw all heatmaps in one panel

Ht1 = DE_heatmap(DESeq_nc_DE, common_id=Compare$common_id, title = "DESeq", km = 2)
Ht2 = DE_heatmap(DESeq2_nc_DE, common_id=Compare$common_id,  title = "DESeq2", km = 2)
Ht3 = DE_heatmap(edgeR_nc_DE, common_id=Compare$common_id, title = "edgeR", km = 2)
Ht4 = DE_heatmap(limma_nc_DE, common_id=Compare$common_id, title = "limma", km = 2)

Ht1 + Ht2 + Ht3 + Ht4

Draw MA plot

par(mfrow=c(2,2), mar=c(5.1, 4.6, 4.1, 1.6))
draw_MA(DESeq_res, type="DESeq")
draw_MA(DESeq2_res, type="DESeq2")
draw_MA(edgeR_res, type="edgeR")
draw_MA(limma_res, type="limma")

Draw Volcano plot

par(mfrow=c(2,2), mar=c(5.1, 4.6, 4.1, 1.6))
draw_volcano(DESeq_res, type="DESeq")
draw_volcano(DESeq2_res, type="DESeq2")
draw_volcano(edgeR_res, type="edgeR")
draw_volcano(limma_res, type="limma")

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14
## 
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libLAPACK.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      stats4    parallel  stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] VennDiagram_1.6.20          futile.logger_1.4.3        
##  [3] plyr_1.8.4                  ComplexHeatmap_1.20.0      
##  [5] e1071_1.7-0                 data.table_1.11.8          
##  [7] DESeq2_1.22.1               SummarizedExperiment_1.12.0
##  [9] DelayedArray_0.8.0          BiocParallel_1.16.0        
## [11] matrixStats_0.54.0          GenomicRanges_1.34.0       
## [13] GenomeInfoDb_1.18.1         IRanges_2.16.0             
## [15] S4Vectors_0.20.1            DESeq_1.34.0               
## [17] lattice_0.20-38             locfit_1.5-9.1             
## [19] Biobase_2.42.0              BiocGenerics_0.28.0        
## [21] cowplot_0.9.3               DT_0.5                     
## [23] reshape2_1.4.3              ggplot2_3.1.0              
## [25] gplots_3.0.1                RColorBrewer_1.1-2         
## [27] pacman_0.5.0                edgeR_3.24.0               
## [29] limma_3.38.2               
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.3-2       rjson_0.2.20           class_7.3-14          
##  [4] rprojroot_1.3-2        circlize_0.4.5         htmlTable_1.12        
##  [7] XVector_0.22.0         GlobalOptions_0.1.0    base64enc_0.1-3       
## [10] rstudioapi_0.8         bit64_0.9-7            AnnotationDbi_1.44.0  
## [13] splines_3.5.1          geneplotter_1.60.0     knitr_1.20            
## [16] Formula_1.2-3          jsonlite_1.5           annotate_1.60.0       
## [19] cluster_2.0.7-1        shiny_1.2.0            compiler_3.5.1        
## [22] backports_1.1.2        assertthat_0.2.0       Matrix_1.2-15         
## [25] lazyeval_0.2.1         formatR_1.5            later_0.7.5           
## [28] acepack_1.4.1          htmltools_0.3.6        tools_3.5.1           
## [31] bindrcpp_0.2.2         gtable_0.2.0           glue_1.3.0            
## [34] GenomeInfoDbData_1.2.0 dplyr_0.7.8            Rcpp_1.0.0            
## [37] gdata_2.18.0           crosstalk_1.0.0        stringr_1.3.1         
## [40] mime_0.6               gtools_3.8.1           XML_3.98-1.16         
## [43] zlibbioc_1.28.0        scales_1.0.0           promises_1.0.1        
## [46] lambda.r_1.2.3         yaml_2.2.0             curl_3.2              
## [49] memoise_1.1.0          gridExtra_2.3          rpart_4.1-13          
## [52] latticeExtra_0.6-28    stringi_1.2.5          RSQLite_2.1.1         
## [55] genefilter_1.64.0      checkmate_1.8.5        caTools_1.17.1.1      
## [58] shape_1.4.4            rlang_0.3.0.1          pkgconfig_2.0.2       
## [61] bitops_1.0-6           evaluate_0.12          purrr_0.2.5           
## [64] bindr_0.1.1            htmlwidgets_1.3        labeling_0.3          
## [67] bit_1.1-14             tidyselect_0.2.5       magrittr_1.5          
## [70] R6_2.3.0               Hmisc_4.1-1            DBI_1.0.0             
## [73] pillar_1.3.0           foreign_0.8-71         withr_2.1.2           
## [76] survival_2.43-1        RCurl_1.95-4.11        nnet_7.3-12           
## [79] tibble_1.4.2           crayon_1.3.4           futile.options_1.0.1  
## [82] KernSmooth_2.23-15     rmarkdown_1.10         GetoptLong_0.1.7      
## [85] blob_1.1.1             digest_0.6.18          xtable_1.8-3          
## [88] httpuv_1.4.5           munsell_0.5.0